# NOTE: machine-specific absolute path. Every relative path used below
# ("code/...", "final-data/...") is resolved against this directory, so it
# must be edited to run the script on another machine.
setwd("d:/PORES Dropbox/Stephen Pettigrew/ranked-choice-voting/replication files/")

# library() stops immediately if tidyverse is not installed; require() would
# only warn and return FALSE, letting the script fail later with a less
# informative "could not find function" error.
library(tidyverse)
options(scipen = 999) # strongly prefer fixed over scientific notation in output
# presumably defines the `colors` and `shapes` objects used further down --
# TODO confirm against code/geography-colors.R
source("code/geography-colors.R")



# Load the saved data object; its `$errors` and `$offices` components are
# the two pieces used to build `errors`.
raw <- readRDS("final-data/all-errors-and-votes.RDS")

# Keep rows where `rcv` is TRUE and the bubble pattern is non-empty, flag
# rows with any of the three mismark types, and attach the office name.
errors <- raw$errors %>%
  filter(rcv, bubble.pattern != "") %>%
  mutate(any.error = overvote | overrank | skip) %>%
  merge(
    raw$offices %>%
      select(geography, date, office.id, office),
    by = c("geography", "date", "office.id"),
    # left join: keep every error row even when no office row matches
    all.x = TRUE)


# Error rates by office-----

# Summarize mismark rates within each group of `x`.
#
# Args:
#   x: a data frame (normally already grouped with group_by()) containing the
#      columns `any.error`, `overrank`, `overvote`, and `skip`.
#   include.count: if TRUE, keep the `count` column (number of rows in each
#      group); it is dropped by default.
#
# Returns an ungrouped data frame with one row per group and the columns
# `error.rate`, `overrank.rate`, `overvote.rate`, and `skips` (plus `count`
# when requested). Rates are plain means with no NA removal, so a group
# containing an NA yields an NA rate.
summarize.rates <- function(x, include.count = FALSE) {
  z <- x %>%
    summarize(count = n(),
              error.rate = mean(any.error),
              overrank.rate = mean(overrank),
              overvote.rate = mean(overvote),
              skips = mean(skip),
              .groups = "drop")
  if (!include.count) {
    z$count <- NULL
  }
  z
}

# Plot label for each rate column produced by summarize.rates(), listed in
# the order the categories should appear on the x-axis.
error.labels <- c(error.rate    = "Mismark of\nany type",
                  skips         = "Front or\ninterior skip",
                  overrank.rate = "Overranked\na candidate",
                  overvote.rate = "Overvoted\na ranking")

# One row per office x error type: `error` holds the rate column name,
# `pct` the rate, and `error2` the display label as an ordered factor.
errors.by.office <- errors %>%
  group_by(geography, date, office.id, office) %>%
  summarize.rates() %>%
  ungroup() %>%
  pivot_longer(cols = -c(geography, date, office.id, office),
               names_to = "error",
               values_to = "pct") %>%
  # exact-match relabel; any name not in error.labels becomes NA
  mutate(error2 = factor(error,
                         levels = names(error.labels),
                         labels = unname(error.labels)))

# Collapse the per-office rates in each group of `x` into the across-office
# median, standard error, mean (`pct`), and a normal-approximation 95%
# interval around the mean. Returns an ungrouped data frame, one row per
# group.
summarize.pct <- function(x) {
  x %>%
    summarize(median = median(pct),
              se = sd(pct) / sqrt(n()),
              # `pct` is overwritten with the group mean here, so median and
              # se must be computed above; lower/upper below use the mean.
              pct = mean(pct),
              lower = pct + qnorm(.025) * se,
              upper = pct + qnorm(.975) * se,
              .groups = "drop")
}

# Error rates by geography and error type.
errors.by.geo <- errors.by.office %>%
  group_by(geography, error, error2) %>%
  summarize.pct() %>%
  # order geographies as in `colors`; done after grouping has been dropped
  mutate(geography = factor(geography, levels = names(colors)))

# Error rates by geography, error type, and election date.
errors.by.geo.date <- errors.by.office %>%
  group_by(geography, error, error2, date) %>%
  summarize.pct() %>%
  mutate(geography = factor(geography, levels = names(colors)))

# Error rates by error type across all offices, with a formatted label
# (e.g. "4.8%") for annotating the figure.
errors.overall <- errors.by.office %>%
  group_by(error, error2) %>%
  summarize.pct() %>%
  mutate(pct.label = sprintf("%.1f%%", pct * 100))


# Explicit print() so the tables are also shown when the script is run with
# source(), where bare top-level expressions are not auto-printed.
print(errors.overall) # top-line error rates to report in text of paper
print(errors.by.geo) # error rates by geography
print(errors.by.geo.date) # error rates by geography and date


## Figure: Error rates by office-----

# Layers, bottom to top: faint per-office points, black grand-mean bar with
# its 95% interval, colored per-geography mean bars, and the grand-mean
# percentage label.
ggplot() + 
  
  # dots for each election
  geom_point(mapping = aes(x = error2, y = pct, color = geography, shape = geography),
             data = errors.by.office,
             position = position_dodge(width = .5),
             alpha = .2) + 
  
  # black grand mean bar: a zero-height errorbar (ymin == ymax) draws as a
  # single horizontal line
  geom_errorbar(mapping = aes(ymin = pct, 
                              ymax = pct, 
                              x = error2),
                data = errors.overall,
                width = .6, # horiz width
                linewidth = .6) + # vert thickness
  # 95% interval around the grand mean (no caps)
  geom_errorbar(mapping = aes(ymin = lower, 
                              ymax = upper, 
                              x = error2),
                data = errors.overall,
                width = 0,
                linewidth = .4) + 
  
  # colored bars for each state avg
  geom_errorbar(mapping = aes(ymin = pct, 
                              ymax = pct, 
                              x = error2,
                              color = geography), 
                data = errors.by.geo,
                width = .4, # horiz width
                linewidth = .5, # vert thickness
                position = position_dodge(width = .5)) + 
  # grand-mean label, nudged right so it clears the bars
  geom_text(mapping = aes(y = pct,
                          x = error2,
                          label = pct.label),
            data = errors.overall, nudge_x = .45) + 
  
  scale_y_continuous("Percent of voters who mismarked their ballot",
                     labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1, .02)) + 
  xlab("") + 
  scale_color_manual("", values = colors) + 
  scale_shape_manual("", values = shapes) + 
  theme_bw() +
  # legend keys: fully opaque symbols, no line drawn through them
  guides(color = guide_legend(override.aes = list(alpha = 1, linetype = 0))) + 
  
  # workaround for a scale_y_break bug that adds a second y-axis on the right.
  # NOTE(review): no scale_y_break() call appears in this plot, so these
  # settings look like a harmless leftover -- confirm before removing.
  theme(axis.text.y.right = element_blank(),
        axis.line.y.right = element_blank(),
        axis.ticks.y.right = element_blank())
